# -*- coding: utf-8 -*-
import re  # regular expressions for text matching
import sys
import urllib.error  # URLError/HTTPError caught in askURL (import explicitly, not via side effect)
import urllib.request  # fetch page data for a given URL

from bs4 import BeautifulSoup  # HTML parsing / data extraction
import xlwt  # Excel (.xls) writing


def main():
    """Scrape the Tokyo Olympics medal table and save it as an Excel file."""
    baseurl = 'https://tiyu.baidu.com/tokyoly/home/tab/%E5%A5%96%E7%89%8C%E6%A6%9C/from/pc'
    # getData() already performs the HTTP fetch via askURL(); the original
    # trailing askURL()/getData() calls re-downloaded the page twice more
    # and discarded the results, so they are removed.
    datalist = getData(baseurl)
    saveData(datalist)


# Pre-compiled patterns for the Baidu medal-table markup (the data-a-588dce74
# attribute is a scoping hash emitted by the page's component framework).
findName = re.compile(r'<span class="name" data-a-588dce74="">(.*)</span>')  # country name
findNum = re.compile(r'<div class="item-(.*)?" data-a-588dce74="">(\d+)?')  # medal count


# 获取数据
# 获取数据
def getData(baseurl):
    """Fetch the medal-table page and extract countries and medal counts.

    Returns a list whose first element is the list of country names and
    whose remaining elements are 4-item lists of medal-count strings
    (gold, silver, bronze, total) — the layout saveData() expects.
    Returns [] when no medal-list markup is found.
    """
    # Initialised before the loop: the original created `data` inside the
    # loop body, so an empty result page raised NameError on return and
    # every iteration discarded the previous div's data. (The unused
    # `datalist` local from the original is dropped.)
    data = []
    text = askURL(baseurl)
    soup = BeautifulSoup(text, 'html.parser')
    for item in soup.find_all('div', class_="medallist"):
        item = str(item)

        # Country names: one findall over the whole medal-list block.
        data.append(re.findall(findName, item))

        # Medal counts: findNum yields tuples, so re-scan their string form
        # for digits, then chunk into groups of four (one row per country).
        nums = re.findall(r'\d+', str(re.findall(findNum, item)))
        for i in range(0, len(nums), 4):
            data.append(nums[i:i + 4])

    return data


# 存储数据
def saveData(datalist):
    workBook = xlwt.Workbook(encoding='utf-8')
    worksheet = workBook.add_sheet('奖牌榜', cell_overwrite_ok=True)
    top = ['国家', '金牌', '银牌', '铜牌', '总数']
    for num in range(5):
        worksheet.write(0, num, top[num])
    for i in range(len(datalist[0])):
            worksheet.write(i+1, 0, datalist[0][i])      # 将93个国家写入
    for i in range(1, len(datalist[0])+1):         # 行数
        for j in range(1, 5):       # 列数
            worksheet.write(i, j, int(datalist[i][j-1]))
    workBook.save('奖牌榜.xls')


# 得到指定url网页的数据
def askURL(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62'
    }  # 用户代理，表示告诉服务器我们是什么类型的机器
    req = urllib.request.Request(url, headers=headers)
    html = ''
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)

    return html


if __name__ == '__main__':
    main()
